{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import scanpy as sc\n",
    "import os\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from anndata import AnnData, read_csv\n",
    "import seaborn as sns\n",
    "from matplotlib import pyplot as plt\n",
    "from glob import glob"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "loc = './data/'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create Zebrafish AnnData"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Assign annotations from SAMap\n",
    "\n",
    "Download the raw zebrafish h5ad file and map annotations from the [eLife paper](https://elifesciences.org/articles/66747)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2022-11-07 14:34:27--  https://kleintools.hms.harvard.edu/paper_websites/wagner_zebrafish_timecourse2018/WagnerScience2018.h5ad\n",
      "Resolving kleintools.hms.harvard.edu (kleintools.hms.harvard.edu)... 134.174.159.103\n",
      "Connecting to kleintools.hms.harvard.edu (kleintools.hms.harvard.edu)|134.174.159.103|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 258580699 (247M)\n",
      "Saving to: ‘./data/WagnerScience2018.h5ad’\n",
      "\n",
      "./data/WagnerScienc 100%[===================>] 246.60M  5.53MB/s    in 28s     \n",
      "\n",
      "2022-11-07 14:34:56 (8.81 MB/s) - ‘./data/WagnerScience2018.h5ad’ saved [258580699/258580699]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!wget -O ./data/WagnerScience2018.h5ad https://kleintools.hms.harvard.edu/paper_websites/wagner_zebrafish_timecourse2018/WagnerScience2018.h5ad "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the h5ad file from the data directory.\n",
    "zebrafish = sc.read(os.path.join(loc, 'WagnerScience2018.h5ad'))\n",
    "\n",
    "# ClusterName values containing '-' lose their first six characters\n",
    "# (e.g. '04hpf-pluripotent' -> 'pluripotent'); all other values are kept unchanged.\n",
    "zebrafish.obs['cluster'] = pd.Categorical(\n",
    "    [\n",
    "        (name[6:] if '-' in name else name)\n",
    "        for name in zebrafish.obs['ClusterName']\n",
    "    ]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the tab-separated mapping file: column 0 -> column 1, first line skipped.\n",
    "with open(os.path.join(loc, 'zebrafish_cell_types_mapping')) as mapping_file:\n",
    "    cell_types_mapping = mapping_file.readlines()\n",
    "\n",
    "ct_map = {}\n",
    "for row in cell_types_mapping[1:]:\n",
    "    fields = row.split(\"\\t\")\n",
    "    ct_map[fields[0].strip()] = fields[1].strip()\n",
    "\n",
    "# Manual entries added on top of (or overriding) the file-based mapping.\n",
    "ct_map.update({\n",
    "    'periderm': 'Periderm',\n",
    "    'pluripotent': 'Pluripotent',\n",
    "    'neural - floorplate posterior': 'Notoplate',\n",
    "    'neural crest - mcamb': 'Neural crest',\n",
    "    'neural crest - melanoblast': 'Neural crest',\n",
    "    'neural crest - iridoblast': 'Neural crest',\n",
    "    'neural crest - xanthophore': 'Neural crest',\n",
    "    'neural crest - crestin': 'Neural crest',\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'EVL', 'NaN'}\n"
     ]
    }
   ],
   "source": [
    "# Look up every cell's whitespace-stripped cluster label in ct_map;\n",
    "# cells without a match receive the placeholder string 'NaN'.\n",
    "samap_clusters = [\n",
    "    ct_map.get(label.strip(), 'NaN') for label in zebrafish.obs['cluster']\n",
    "]\n",
    "# Labels (as stored, unstripped) that had no entry in ct_map.\n",
    "not_found = [\n",
    "    label for label in zebrafish.obs['cluster'] if label.strip() not in ct_map\n",
    "]\n",
    "print(set(not_found))\n",
    "\n",
    "zebrafish.obs['cell_type'] = pd.Categorical(samap_clusters)\n",
    "# Keep only the cells that were mapped, then save the annotated object.\n",
    "zebrafish = zebrafish[zebrafish.obs['cell_type'] != 'NaN']\n",
    "zebrafish.write(os.path.join(loc, 'zebrafish_annot.h5ad'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load zebrafish data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "zebrafish = sc.read(os.path.join(loc,'zebrafish_annot.h5ad'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>n_counts</th>\n",
       "      <th>unique_cell_id</th>\n",
       "      <th>cell_names</th>\n",
       "      <th>library_id</th>\n",
       "      <th>batch</th>\n",
       "      <th>ClusterID</th>\n",
       "      <th>ClusterName</th>\n",
       "      <th>TissueID</th>\n",
       "      <th>TissueName</th>\n",
       "      <th>TimeID</th>\n",
       "      <th>cluster</th>\n",
       "      <th>cell_type</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>index</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0-0-0</th>\n",
       "      <td>15111.0</td>\n",
       "      <td>DEW050_AGTCAATAC-TTGGATCG</td>\n",
       "      <td>bcGPGV</td>\n",
       "      <td>DEW050</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>04hpf-pluripotent</td>\n",
       "      <td>9</td>\n",
       "      <td>Pluripotent</td>\n",
       "      <td>4hpf</td>\n",
       "      <td>pluripotent</td>\n",
       "      <td>Pluripotent</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1-0-0</th>\n",
       "      <td>2337.0</td>\n",
       "      <td>DEW050_AAGAACGGG-GCGTTGCT</td>\n",
       "      <td>bcDSDI</td>\n",
       "      <td>DEW050</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>04hpf-pluripotent</td>\n",
       "      <td>9</td>\n",
       "      <td>Pluripotent</td>\n",
       "      <td>4hpf</td>\n",
       "      <td>pluripotent</td>\n",
       "      <td>Pluripotent</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2-0-0</th>\n",
       "      <td>2078.0</td>\n",
       "      <td>DEW050_GACCTACTAG-TTAGTCCG</td>\n",
       "      <td>bcENHV</td>\n",
       "      <td>DEW050</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>04hpf-pluripotent</td>\n",
       "      <td>9</td>\n",
       "      <td>Pluripotent</td>\n",
       "      <td>4hpf</td>\n",
       "      <td>pluripotent</td>\n",
       "      <td>Pluripotent</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3-0-0</th>\n",
       "      <td>1648.0</td>\n",
       "      <td>DEW050_GTTTGTTT-GGTCCCTT</td>\n",
       "      <td>bcAABE</td>\n",
       "      <td>DEW050</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>04hpf-pluripotent</td>\n",
       "      <td>9</td>\n",
       "      <td>Pluripotent</td>\n",
       "      <td>4hpf</td>\n",
       "      <td>pluripotent</td>\n",
       "      <td>Pluripotent</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4-0-0</th>\n",
       "      <td>1153.0</td>\n",
       "      <td>DEW050_TGATTGCACGC-TAACCATC</td>\n",
       "      <td>bcFTTU</td>\n",
       "      <td>DEW050</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>04hpf-pluripotent</td>\n",
       "      <td>9</td>\n",
       "      <td>Pluripotent</td>\n",
       "      <td>4hpf</td>\n",
       "      <td>pluripotent</td>\n",
       "      <td>Pluripotent</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1993-28-6</th>\n",
       "      <td>2525.0</td>\n",
       "      <td>DEW169_ATTTCCAT-CAGTCCCT</td>\n",
       "      <td>bcDANY</td>\n",
       "      <td>DEW169</td>\n",
       "      <td>6</td>\n",
       "      <td>135</td>\n",
       "      <td>24hpf-muscle - myl1</td>\n",
       "      <td>8</td>\n",
       "      <td>Mesoderm</td>\n",
       "      <td>24hpf</td>\n",
       "      <td>muscle - myl1</td>\n",
       "      <td>Skeletal muscle</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1994-28-6</th>\n",
       "      <td>1548.0</td>\n",
       "      <td>DEW169_CTACGGGA-ATACCCAG</td>\n",
       "      <td>bcELQP</td>\n",
       "      <td>DEW169</td>\n",
       "      <td>6</td>\n",
       "      <td>157</td>\n",
       "      <td>24hpf-differentiating neurons - eomesa</td>\n",
       "      <td>1</td>\n",
       "      <td>Forebrain / Optic</td>\n",
       "      <td>24hpf</td>\n",
       "      <td>differentiating neurons - eomesa</td>\n",
       "      <td>Neuron</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-28-6</th>\n",
       "      <td>5054.0</td>\n",
       "      <td>DEW169_TGGAAAGC-CCGCAACT</td>\n",
       "      <td>bcIGIF</td>\n",
       "      <td>DEW169</td>\n",
       "      <td>6</td>\n",
       "      <td>138</td>\n",
       "      <td>24hpf-neural - diencephalon</td>\n",
       "      <td>1</td>\n",
       "      <td>Forebrain / Optic</td>\n",
       "      <td>24hpf</td>\n",
       "      <td>neural - diencephalon</td>\n",
       "      <td>Forebrain/midbrain</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2001-28-6</th>\n",
       "      <td>3270.0</td>\n",
       "      <td>DEW169_GAAAGACA-GTCCGTAC</td>\n",
       "      <td>bcFBRF</td>\n",
       "      <td>DEW169</td>\n",
       "      <td>6</td>\n",
       "      <td>178</td>\n",
       "      <td>24hpf-pharyngeal arch - ndnf</td>\n",
       "      <td>8</td>\n",
       "      <td>Mesoderm</td>\n",
       "      <td>24hpf</td>\n",
       "      <td>pharyngeal arch - ndnf</td>\n",
       "      <td>Intermediate mesoderm</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2002-28-6</th>\n",
       "      <td>2813.0</td>\n",
       "      <td>DEW169_CCTCATGA-CTAGTAAC</td>\n",
       "      <td>bcDZHN</td>\n",
       "      <td>DEW169</td>\n",
       "      <td>6</td>\n",
       "      <td>136</td>\n",
       "      <td>24hpf-neural - midbrain</td>\n",
       "      <td>2</td>\n",
       "      <td>Midbrain</td>\n",
       "      <td>24hpf</td>\n",
       "      <td>neural - midbrain</td>\n",
       "      <td>Forebrain/midbrain</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>63419 rows × 12 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           n_counts               unique_cell_id cell_names library_id batch  \\\n",
       "index                                                                          \n",
       "0-0-0       15111.0    DEW050_AGTCAATAC-TTGGATCG     bcGPGV     DEW050     0   \n",
       "1-0-0        2337.0    DEW050_AAGAACGGG-GCGTTGCT     bcDSDI     DEW050     0   \n",
       "2-0-0        2078.0   DEW050_GACCTACTAG-TTAGTCCG     bcENHV     DEW050     0   \n",
       "3-0-0        1648.0     DEW050_GTTTGTTT-GGTCCCTT     bcAABE     DEW050     0   \n",
       "4-0-0        1153.0  DEW050_TGATTGCACGC-TAACCATC     bcFTTU     DEW050     0   \n",
       "...             ...                          ...        ...        ...   ...   \n",
       "1993-28-6    2525.0     DEW169_ATTTCCAT-CAGTCCCT     bcDANY     DEW169     6   \n",
       "1994-28-6    1548.0     DEW169_CTACGGGA-ATACCCAG     bcELQP     DEW169     6   \n",
       "2000-28-6    5054.0     DEW169_TGGAAAGC-CCGCAACT     bcIGIF     DEW169     6   \n",
       "2001-28-6    3270.0     DEW169_GAAAGACA-GTCCGTAC     bcFBRF     DEW169     6   \n",
       "2002-28-6    2813.0     DEW169_CCTCATGA-CTAGTAAC     bcDZHN     DEW169     6   \n",
       "\n",
       "          ClusterID                             ClusterName TissueID  \\\n",
       "index                                                                  \n",
       "0-0-0             1                       04hpf-pluripotent        9   \n",
       "1-0-0             1                       04hpf-pluripotent        9   \n",
       "2-0-0             1                       04hpf-pluripotent        9   \n",
       "3-0-0             1                       04hpf-pluripotent        9   \n",
       "4-0-0             1                       04hpf-pluripotent        9   \n",
       "...             ...                                     ...      ...   \n",
       "1993-28-6       135                     24hpf-muscle - myl1        8   \n",
       "1994-28-6       157  24hpf-differentiating neurons - eomesa        1   \n",
       "2000-28-6       138            24hpf-neural - diencephalon         1   \n",
       "2001-28-6       178            24hpf-pharyngeal arch - ndnf        8   \n",
       "2002-28-6       136                 24hpf-neural - midbrain        2   \n",
       "\n",
       "                  TissueName TimeID                           cluster  \\\n",
       "index                                                                   \n",
       "0-0-0            Pluripotent   4hpf                       pluripotent   \n",
       "1-0-0            Pluripotent   4hpf                       pluripotent   \n",
       "2-0-0            Pluripotent   4hpf                       pluripotent   \n",
       "3-0-0            Pluripotent   4hpf                       pluripotent   \n",
       "4-0-0            Pluripotent   4hpf                       pluripotent   \n",
       "...                      ...    ...                               ...   \n",
       "1993-28-6           Mesoderm  24hpf                     muscle - myl1   \n",
       "1994-28-6  Forebrain / Optic  24hpf  differentiating neurons - eomesa   \n",
       "2000-28-6  Forebrain / Optic  24hpf            neural - diencephalon    \n",
       "2001-28-6           Mesoderm  24hpf            pharyngeal arch - ndnf   \n",
       "2002-28-6           Midbrain  24hpf                 neural - midbrain   \n",
       "\n",
       "                       cell_type  \n",
       "index                             \n",
       "0-0-0                Pluripotent  \n",
       "1-0-0                Pluripotent  \n",
       "2-0-0                Pluripotent  \n",
       "3-0-0                Pluripotent  \n",
       "4-0-0                Pluripotent  \n",
       "...                          ...  \n",
       "1993-28-6        Skeletal muscle  \n",
       "1994-28-6                 Neuron  \n",
       "2000-28-6     Forebrain/midbrain  \n",
       "2001-28-6  Intermediate mesoderm  \n",
       "2002-28-6     Forebrain/midbrain  \n",
       "\n",
       "[63419 rows x 12 columns]"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "zebrafish.obs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Re-assign the cell names, gene names and cell_type column from plain Python lists.\n",
    "zebrafish.obs_names = list(zebrafish.obs_names)\n",
    "zebrafish.var_names = list(zebrafish.var_names)\n",
    "zebrafish.obs['cell_type'] = list(zebrafish.obs['cell_type'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "sc.pp.filter_cells(zebrafish, min_genes=500)\n",
    "sc.pp.filter_genes(zebrafish, min_cells=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3678.0"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: largest entry of the expression matrix.\n",
    "# Reduce on the sparse matrix directly; calling .toarray() first would allocate\n",
    "# the full dense cells x genes array just to read a single number.\n",
    "zebrafish.X.max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "zebrafish.write(os.path.join(loc, \"zebrafish.h5ad\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "AnnData object with n_obs × n_vars = 63371 × 30032\n",
       "    obs: 'n_counts', 'unique_cell_id', 'cell_names', 'library_id', 'batch', 'ClusterID', 'ClusterName', 'TissueID', 'TissueName', 'TimeID', 'cluster', 'cell_type', 'n_genes'\n",
       "    var: 'n_cells'"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "zebrafish"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create Frog AnnData\n",
    "\n",
    "Download the raw frog count table (gzipped TSV) and map annotations from the [eLife paper](https://elifesciences.org/articles/66747)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Download the data from GEO (series GSE113074)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2022-11-07 14:41:36--  https://ftp.ncbi.nlm.nih.gov/geo/series/GSE113nnn/GSE113074/suppl/GSE113074_Raw_combined.annotated_counts.tsv.gz\n",
      "Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 165.112.9.229, 165.112.9.230, 2607:f220:41f:250::228, ...\n",
      "Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|165.112.9.229|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 177842456 (170M) [application/x-gzip]\n",
      "Saving to: ‘./data/GSE113074_Raw_combined.annotated_counts.tsv’\n",
      "\n",
      "./data/GSE113074_Ra 100%[===================>] 169.60M  9.61MB/s    in 20s     \n",
      "\n",
      "2022-11-07 14:41:57 (8.52 MB/s) - ‘./data/GSE113074_Raw_combined.annotated_counts.tsv’ saved [177842456/177842456]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!wget -O ./data/GSE113074_Raw_combined.annotated_counts.tsv.gz \"https://ftp.ncbi.nlm.nih.gov/geo/series/GSE113nnn/GSE113074/suppl/GSE113074_Raw_combined.annotated_counts.tsv.gz\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "# -f overwrites an existing .tsv, so re-running the notebook after a fresh download\n",
    "# does not leave a stale decompressed file in place.\n",
    "!gunzip -f ./data/GSE113074_Raw_combined.annotated_counts.tsv.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_frog_h5ad():\n",
    "    \"\"\"Build the frog AnnData from the GEO count table and save it as h5ad.\n",
    "\n",
    "    Reads ``GSE113074_Raw_combined.annotated_counts.tsv`` from ``loc``. The\n",
    "    indexing below takes per-cell metadata from the file's header lines\n",
    "    (0-based line 3: library, 5: barcode, 6: developmental stage, 7: cluster,\n",
    "    8: parent cluster) and one gene per line from line 9 onwards, with the\n",
    "    row label in the first column.\n",
    "\n",
    "    Every cluster is translated to a cell type through the tab-separated\n",
    "    ``frog_cell_types_mapping`` file (its first line is skipped); 'Outlier'\n",
    "    maps to itself.\n",
    "\n",
    "    Returns:\n",
    "        AnnData: cells x genes matrix whose ``.obs`` holds ``library``,\n",
    "        ``clusters``, ``dev_stage``, ``parent_clusters`` and ``cell_type``.\n",
    "        The object is also written to\n",
    "        ``GSE113074_Corrected_combined.annotated_counts.h5ad`` in ``loc``.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: if a cluster has no entry in the cell type mapping.\n",
    "    \"\"\"\n",
    "    def _clean_cluster(field):\n",
    "        # Labels starting with 'S' lose their first four characters\n",
    "        # (presumably a stage prefix -- TODO confirm against the file).\n",
    "        return field[4:].strip() if field.startswith('S') else field.strip()\n",
    "\n",
    "    filepath_frog = os.path.join(loc, 'GSE113074_Raw_combined.annotated_counts.tsv')\n",
    "    with open(filepath_frog) as f:\n",
    "        frog_data = f.readlines()\n",
    "\n",
    "    # Strip every header field: the last one carries the line's trailing\n",
    "    # newline, which would otherwise end up in the last cell's name or label.\n",
    "    barcodes = [field.strip() for field in frog_data[5].split(\"\\t\")]\n",
    "    libraries = [field.strip() for field in frog_data[3].split(\"\\t\")]\n",
    "    dev_stage = [field.strip() for field in frog_data[6].split(\"\\t\")]\n",
    "    clusters = [_clean_cluster(field) for field in frog_data[7].split(\"\\t\")]\n",
    "    parent_clusters = [_clean_cluster(field) for field in frog_data[8].split(\"\\t\")]\n",
    "\n",
    "    # Blank lines (e.g. at the end of the file) are not gene rows.\n",
    "    gene_rows = [row for row in frog_data[9:] if row.strip()]\n",
    "    # The first column is the row label, hence len(barcodes) - 1 cells.\n",
    "    # float32 halves the footprint of this dense matrix and matches the dtype\n",
    "    # shown in this notebook's recorded `frog.X` output.\n",
    "    data = np.zeros((len(gene_rows), len(barcodes) - 1), dtype=np.float32)\n",
    "    genes = []\n",
    "    for i, row in enumerate(gene_rows):\n",
    "        fields = row.split(\"\\t\")\n",
    "        genes.append(fields[0])\n",
    "        # Builtin float replaces np.float, an alias that NumPy deprecated in\n",
    "        # 1.20 and removed in 1.24.\n",
    "        data[i, :] = [float(value) for value in fields[1:]]\n",
    "\n",
    "    # create anndata (rows are genes in the file, so transpose to cells x genes)\n",
    "    adata = AnnData(data.transpose())\n",
    "    adata.var_names = genes\n",
    "    adata.obs_names = barcodes[1:]\n",
    "    adata.obs['library'] = libraries[1:]\n",
    "    adata.obs['clusters'] = clusters[1:]\n",
    "    adata.obs['dev_stage'] = dev_stage[1:]\n",
    "    adata.obs['parent_clusters'] = parent_clusters[1:]\n",
    "\n",
    "    # add samap cell type annotations\n",
    "    with open(os.path.join(loc, 'frog_cell_types_mapping')) as f:\n",
    "        cell_types_mapping = f.readlines()\n",
    "    ct_map = {}\n",
    "    for line in cell_types_mapping[1:]:\n",
    "        if not line.strip():\n",
    "            continue  # tolerate blank lines in the mapping file\n",
    "        el = line.split(\"\\t\")\n",
    "        ct_map[el[0].strip()] = el[1].strip()\n",
    "    ct_map['Outlier'] = 'Outlier'\n",
    "\n",
    "    cluster_names = [name.strip() for name in adata.obs['clusters']]\n",
    "    # Report every unmapped cluster at once instead of stopping at the first one.\n",
    "    unmapped = sorted({name for name in cluster_names if name not in ct_map})\n",
    "    if unmapped:\n",
    "        raise KeyError(f'clusters missing from frog_cell_types_mapping: {unmapped}')\n",
    "    adata.obs['cell_type'] = [ct_map[name] for name in cluster_names]\n",
    "\n",
    "    adata.write(os.path.join(loc, 'GSE113074_Corrected_combined.annotated_counts.h5ad'))\n",
    "    return adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "frog = generate_frog_h5ad()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "frog_annot = sc.read_h5ad(os.path.join(loc,'GSE113074_Corrected_combined.annotated_counts.h5ad'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "frog_counts = frog"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "AnnData object with n_obs × n_vars = 136966 × 26550\n",
       "    obs: 'library', 'clusters', 'dev_stage', 'parent_clusters', 'cell_type'"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "frog_annot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop the cells labelled 'Outlier'. Subsetting an AnnData returns a view; .copy()\n",
    "# turns it into a standalone object, so the in-place edits in the following cells\n",
    "# act on real data instead of relying on an implicit copy-on-write of the view.\n",
    "frog = frog[frog.obs['cell_type'] != 'Outlier'].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Re-assign the cell names, gene names and cell_type column from plain Python lists.\n",
    "frog.obs_names = list(frog.obs_names)\n",
    "frog.var_names = list(frog.var_names)\n",
    "frog.obs['cell_type'] = list(frog.obs['cell_type'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "sc.pp.filter_cells(frog, min_genes=500)\n",
    "sc.pp.filter_genes(frog, min_cells=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 4.,  1.,  0., ...,  2.,  0., 10.],\n",
       "       [ 1.,  0.,  0., ...,  0.,  0.,  5.],\n",
       "       [ 1.,  7.,  0., ...,  0.,  0.,  1.],\n",
       "       ...,\n",
       "       [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32)"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "frog.X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "frog.write(os.path.join(loc, \"frog.h5ad\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
